# Program input:
# (1) inputFileName = "/data5/team/obukai/the_new_trip_of_feature_generation/gov2ClearYourMindAndDoItAgain/freqOfTermsInQueries.txt"

# Program output:
# (1) outputFileName = "/data5/team/obukai/the_new_trip_of_feature_generation/gov2ClearYourMindAndDoItAgain/queryLexicon.batchQueryCompatibleMode.txt"

# Output File Analysis:
# The file called queryLexicon.batchQueryCompatibleMode.txt contains 38871 unique query terms, and they are batch-query compatible with the original polyIRToolkit.
# This file can be used as the input feed to the polyIRToolkit so that the toolkit outputs the frequency of those terms in a collection (e.g. in gov2 or clueweb09).
# the output file name for gov2 dataset is: /data3/obukai/the_new_trip_of_feature_generation/gov2ClearYourMindAndDoItAgain/wholeLexiconTermsWithTermFreqInCollection.txt

from struct import *
import sys

# Build the query lexicon file: for every line of the input file, take the
# first space-separated token (the query term) and write it to the output
# file as "<index>:<term>", where <index> is the 0-based line number.
# Each term is also echoed to stdout.
#
# print(...) with a single argument is used so the script produces the same
# output on Python 2 and also runs on Python 3.
print("Begin:")

inputFileName = "/data5/team/obukai/the_new_trip_of_feature_generation/gov2ClearYourMindAndDoItAgain/freqOfTermsInQueries.txt"
outputFileName = "/data5/team/obukai/the_new_trip_of_feature_generation/gov2ClearYourMindAndDoItAgain/queryLexicon.batchQueryCompatibleMode.txt"

# The `with` blocks guarantee both files are closed even if reading or
# writing raises part-way through. The input is opened first so that a
# missing input file does not truncate an existing output file.
with open(inputFileName, "r") as inputFileHandler:
    with open(outputFileName, "w") as outputFileHandler:
        # Iterate the file object directly so lines are streamed instead of
        # loading the whole file into memory with readlines().
        for index, line in enumerate(inputFileHandler):
            # Only the first token on the line is the term; anything after
            # the first space is ignored.
            queryTermLexicon = line.strip().split(" ")[0]
            outputFileHandler.write(str(index) + ":" + queryTermLexicon + "\n")
            print(queryTermLexicon)


print("End...")

